<--- %%NOBANNER%% --> doubles.sas
 BackForward

/*-------------------------------------------------------------------------------\
| SOURCE:   doubles.sas                                                          |
|                                                                                |
| VERSION:  RDS v3.0                                                             |
|                                                                                |
| PURPOSE:  This macro is designed to check for exact duplicate and/or near      |
|           duplicate observations in a SAS dataset.  It generates a             |
|           printout of near duplicates for further manual examination and       |
|           notes in the log changes made to the output dataset.                 |
|                                                                                |
| SYNTAX:   %doubles (indata  = _input_dataset_,                       |
|                     outdata = _output_dataset_,                      |
|                     var     = _unique_var_list_,                     |
|                     nodup   = _yes_or_no_)                           |
|                                                                                |
\-------------------------------------------------------------------------------*/
%macro doubles (indata=, outdata=, var=, nodup=yes) /des='doubles identifier' ;
/*-------------------------------------------------------------\
| turn notes off, begin time stamp, globalize and localize all |
| variables, calculate key macro variables.                    |
\-------------------------------------------------------------*/
%let notes=%sysfunc(getoption(notes,keyword)); options nonotes;
%let starttime = %sysfunc(datetime());
%local dataset var varnum sortvar varcnt orig nodup del near notes outdata starttime i x;
%let varnum  = %words (&var); %let sortvar = %scan(&var, &varnum);
%if %upcase(%substr(&nodup, 1, 1)) = Y %then %let nodup = nodup noequals;
%else %let nodup = %str();

/*--------------------------------------------------\
| verify that variable named X is not in input data |
| if so, find another variable name!                |
+---------------------------------------------------+-------------------+
| in THEORY, this will not work all the time... but someone has to have |
| basically be using variales named x0-xINFINITY before it will fail.   |
| If you have that many variables in your dataset, this utility will    |
| not be practical and you should code your own-- or change KEYVAR = X  |
| to KEYVAR = Y                                                         |
\----------------------------------------------------------------------*/
proc contents data=&indata noprint out=contents (keep = name) ;
run ;

%let x = x0 ;
%do i = 1 %to %eval(%nobs(&indata)+1) ;
   data _null_ ;
      set contents nobs=nobs ;
      /*-------------------------------------------------------\
      | if we encounter the variable name, then increment by 1 |
      | and kick out of the data step, going through the loop  |
      | again, searching for a safe variable name              |
      \-------------------------------------------------------*/
      if upcase(name) = "%upcase(&x)" then do ;
         call symput ('x', 'x'||trim(left(put(_n_, best.)))) ;
         stop ;
      end ;
      /*----------------------------------------------------\
      | if the variable name is safe we can kick out of the |
      | macro DO loop                                       |
      \----------------------------------------------------*/
      else if _n_ = nobs then call symput ('i', %nobs(&indata) + 1) ;
   run ;
%end ;

/*---------------------------------------------------------\
| sort dataset by &var, remove exact duplicates if desired |
\---------------------------------------------------------*/
proc sort data=&indata &nodup out=&outdata (label = "DOUBLES macro output for %data(&indata)"); by &var; run;
%if %nobs(&indata) ne %nobs(&outdata) %then %do ;
   %let delete = %eval(%nobs(&indata) - %nobs(&outdata)) ;
   options ¬es ;
   %put NOTE: &delete duplicate observations were deleted. ;
   options nonotes ;
%end ;

/*--------------------------------------------------------\
| create first and second temporary datasets for use with |
| proc compare.                                           |
\--------------------------------------------------------*/
data temp1 temp2;
   set &outdata;
   by &var;
   if first.&sortvar then output temp1;
   else output temp2;
run;

proc compare base=temp1 compare=temp2 noprint out=a outnoequal; by &var; run;

/*-----------------------------------------\
| add a marking variable to key dataset t1 |
\-----------------------------------------*/
data t1(keep=&var &x) /view=t1; set a; &x = '|'; run;
/*-----------------------------------------\
| use key duplicate dataset to check temp1 |
\-----------------------------------------*/
data checka /view=checka; merge t1 temp1; by &var; run;

/*-----------------------------------------\
| use key duplicate dataset to check temp2 |
\-----------------------------------------*/
data checkb /view=checkb; merge t1 temp2; by &var; run;

/*---------------------------------------------------------\
| merge data together and keep if data is a near duplicate |
\---------------------------------------------------------*/
data check; set checka checkb; by &var; if &x = 'x'; run;

/*-------------------------\
| print the duplicate data |
\-------------------------*/
proc print data=check; run;

/*--------------------------\
| delete temporary datasets |
\--------------------------*/
proc datasets library=macro memtype=(data view) nolist;
   delete a temp1 temp2 t1 checka checkb check; run;quit;

/*-----------------------------------\
| Prepare sas session for open code. |
\-----------------------------------*/
options ¬es; %doubles (&outdata)
options nonotes; %timenote (macro=doubles, starttime=&starttime);
%mend doubles ;